DATA602 — Michael Ippolito
# core
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# ml
from sklearn import datasets as ds
from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split as tts
#plotly or other graphing library
import plotly.express as px
import plotly.io as pio
from matplotlib import pyplot as plt
# Set the default plotly renderer so figures display inline in the notebook
pio.renderers.default = 'notebook'
%%HTML
<!-- Set dataframe style -->
<style>.dataframe th{
background:#3f577c;
font-family:monospace;
color:white;
border:3px solid white;
text-align:left !important;}
</style>
# Load datasets here once and assign to variables iris and boston
# Load iris data (Bunch with 'data', 'target', 'feature_names', 'target_names')
iris = ds.load_iris()
# Load boston data
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — this cell requires scikit-learn < 1.2 (newer versions must use
# fetch_openml(name="boston") or another source instead). TODO confirm pinned
# sklearn version for this notebook.
boston = ds.load_boston()
Q1
Data set: Iris
# Build the iris feature matrix (named columns) and target vector,
# then preview the first rows and the class labels.
iris_X = pd.DataFrame(iris['data'], columns=iris['feature_names'])
iris_y = iris['target']
print("Iris features:")
display(iris_X.head())
print()
print("Iris target names:")
print(iris['target_names'])
Iris features:
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
Iris target names: ['setosa' 'versicolor' 'virginica']
Q2
Data set: Iris
# Train a 5-nearest-neighbor classifier on the full iris data,
# then run the fitted model back over those same observations and
# report the predicted class names.
knn = KNN(n_neighbors=5)
knn.fit(iris_X, iris_y)
predicted = knn.predict(iris_X)
print("New observations:")
print(iris['target_names'][predicted])
New observations: ['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor' 'virginica' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica']
Q3 15 pts
Data set: Iris
Split the Iris dataset into a train / test model with the split ratio between the two established by the function parameter split.
Fit KNN with the training data with number of neighbors equal to the function parameter neighbors
Generate and return back an accuracy score using the test data that was split out
def knn_accuracy(split=0.3, neighbors=5):
    """Split iris into train/test, fit KNN, and return the test accuracy.

    Parameters
    ----------
    split : float
        Fraction of the data held out as the test set.
    neighbors : int
        Number of neighbors (k) for the KNN classifier.

    Returns
    -------
    float
        Accuracy score on the held-out test data.
    """
    # Split the data into train and test (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = tts(
        iris_X, iris_y, test_size=split, random_state=777)
    # Fit the KNN model on the training portion
    model = KNN(n_neighbors=neighbors)
    model.fit(X_train, y_train)
    # Score against the held-out test data
    return model.score(X_test, y_test)

# Run with the assignment's parameters (30% test split, k=5)
knn_score = knn_accuracy(split=0.3, neighbors=5)
print('Accuracy:', knn_score)
Accuracy: 0.9777777777777777
Q4
Data set: Iris
# Set split rate for test/training data
split = 0.3

# The split does not depend on k (same data, same fixed random_state each
# iteration), so hoist it out of the loop and split once.
X_train, X_test, y_train, y_test = tts(
    iris_X, iris_y, test_size=split, random_state=777)

# Score a KNN model for each k in 1..30, collecting rows in a plain list and
# building the DataFrame once at the end (per-iteration pd.concat onto a
# growing frame is quadratic, and concatenating with an empty frame is
# deprecated in pandas).
rows = []
for k in range(1, 31):
    knn = KNN(n_neighbors=k)
    knn.fit(X_train, y_train)
    rows.append({'k': k, 'accuracy': knn.score(X_test, y_test)})
dfscores = pd.DataFrame(rows, columns=['k', 'accuracy'])

# Plot accuracy as a function of k
fig = px.scatter(dfscores, x='k', y='accuracy', template='plotly_white')
fig.show()
| k | accuracy |
|---|
Q5 10 pts
Data set: Boston
Load sklearn's Boston data into a DataFrame (only the data, with feature_names as the column names)
Load sklearn's Boston target values into a separate DataFrame
Return back the average of AGE, average of the target (median value of homes or MEDV), and the target as NumPy values
# Load boston features and target into separate dataframes
boston_X = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_y = pd.DataFrame(boston['target'], columns=['MEDV'])

# Print select averages. The task asks for the *average* of the target
# (MEDV), so use np.mean — np.median here computed the median (21.2)
# rather than the mean (~22.53).
print('Average age:', np.mean(boston_X['AGE']))
print('Average median home value:', np.mean(boston_y['MEDV']))
print()

# Print the target values as a NumPy array
print('Target values:')
boston_y_arr = np.array(boston_y)
print(boston_y_arr)
Average age: 68.57490118577076 Average median home value: 21.2 Target values: [[24. ] [21.6] [34.7] [33.4] [36.2] [28.7] [22.9] [27.1] [16.5] [18.9] [15. ] [18.9] [21.7] [20.4] [18.2] [19.9] [23.1] [17.5] [20.2] [18.2] [13.6] [19.6] [15.2] [14.5] [15.6] [13.9] [16.6] [14.8] [18.4] [21. ] [12.7] [14.5] [13.2] [13.1] [13.5] [18.9] [20. ] [21. ] [24.7] [30.8] [34.9] [26.6] [25.3] [24.7] [21.2] [19.3] [20. ] [16.6] [14.4] [19.4] [19.7] [20.5] [25. ] [23.4] [18.9] [35.4] [24.7] [31.6] [23.3] [19.6] [18.7] [16. ] [22.2] [25. ] [33. ] [23.5] [19.4] [22. ] [17.4] [20.9] [24.2] [21.7] [22.8] [23.4] [24.1] [21.4] [20. ] [20.8] [21.2] [20.3] [28. ] [23.9] [24.8] [22.9] [23.9] [26.6] [22.5] [22.2] [23.6] [28.7] [22.6] [22. ] [22.9] [25. ] [20.6] [28.4] [21.4] [38.7] [43.8] [33.2] [27.5] [26.5] [18.6] [19.3] [20.1] [19.5] [19.5] [20.4] [19.8] [19.4] [21.7] [22.8] [18.8] [18.7] [18.5] [18.3] [21.2] [19.2] [20.4] [19.3] [22. ] [20.3] [20.5] [17.3] [18.8] [21.4] [15.7] [16.2] [18. ] [14.3] [19.2] [19.6] [23. ] [18.4] [15.6] [18.1] [17.4] [17.1] [13.3] [17.8] [14. ] [14.4] [13.4] [15.6] [11.8] [13.8] [15.6] [14.6] [17.8] [15.4] [21.5] [19.6] [15.3] [19.4] [17. ] [15.6] [13.1] [41.3] [24.3] [23.3] [27. ] [50. ] [50. ] [50. ] [22.7] [25. ] [50. ] [23.8] [23.8] [22.3] [17.4] [19.1] [23.1] [23.6] [22.6] [29.4] [23.2] [24.6] [29.9] [37.2] [39.8] [36.2] [37.9] [32.5] [26.4] [29.6] [50. ] [32. ] [29.8] [34.9] [37. ] [30.5] [36.4] [31.1] [29.1] [50. ] [33.3] [30.3] [34.6] [34.9] [32.9] [24.1] [42.3] [48.5] [50. ] [22.6] [24.4] [22.5] [24.4] [20. ] [21.7] [19.3] [22.4] [28.1] [23.7] [25. ] [23.3] [28.7] [21.5] [23. ] [26.7] [21.7] [27.5] [30.1] [44.8] [50. ] [37.6] [31.6] [46.7] [31.5] [24.3] [31.7] [41.7] [48.3] [29. ] [24. ] [25.1] [31.5] [23.7] [23.3] [22. ] [20.1] [22.2] [23.7] [17.6] [18.5] [24.3] [20.5] [24.5] [26.2] [24.4] [24.8] [29.6] [42.8] [21.9] [20.9] [44. ] [50. ] [36. ] [30.1] [33.8] [43.1] [48.8] [31. ] [36.5] [22.8] [30.7] [50. 
] [43.5] [20.7] [21.1] [25.2] [24.4] [35.2] [32.4] [32. ] [33.2] [33.1] [29.1] [35.1] [45.4] [35.4] [46. ] [50. ] [32.2] [22. ] [20.1] [23.2] [22.3] [24.8] [28.5] [37.3] [27.9] [23.9] [21.7] [28.6] [27.1] [20.3] [22.5] [29. ] [24.8] [22. ] [26.4] [33.1] [36.1] [28.4] [33.4] [28.2] [22.8] [20.3] [16.1] [22.1] [19.4] [21.6] [23.8] [16.2] [17.8] [19.8] [23.1] [21. ] [23.8] [23.1] [20.4] [18.5] [25. ] [24.6] [23. ] [22.2] [19.3] [22.6] [19.8] [17.1] [19.4] [22.2] [20.7] [21.1] [19.5] [18.5] [20.6] [19. ] [18.7] [32.7] [16.5] [23.9] [31.2] [17.5] [17.2] [23.1] [24.5] [26.6] [22.9] [24.1] [18.6] [30.1] [18.2] [20.6] [17.8] [21.7] [22.7] [22.6] [25. ] [19.9] [20.8] [16.8] [21.9] [27.5] [21.9] [23.1] [50. ] [50. ] [50. ] [50. ] [50. ] [13.8] [13.8] [15. ] [13.9] [13.3] [13.1] [10.2] [10.4] [10.9] [11.3] [12.3] [ 8.8] [ 7.2] [10.5] [ 7.4] [10.2] [11.5] [15.1] [23.2] [ 9.7] [13.8] [12.7] [13.1] [12.5] [ 8.5] [ 5. ] [ 6.3] [ 5.6] [ 7.2] [12.1] [ 8.3] [ 8.5] [ 5. ] [11.9] [27.9] [17.2] [27.5] [15. ] [17.2] [17.9] [16.3] [ 7. ] [ 7.2] [ 7.5] [10.4] [ 8.8] [ 8.4] [16.7] [14.2] [20.8] [13.4] [11.7] [ 8.3] [10.2] [10.9] [11. ] [ 9.5] [14.5] [14.1] [16.1] [14.3] [11.7] [13.4] [ 9.6] [ 8.7] [ 8.4] [12.8] [10.5] [17.1] [18.4] [15.4] [10.8] [11.8] [14.9] [12.6] [14.1] [13. ] [13.4] [15.2] [16.1] [17.8] [14.9] [14.1] [12.7] [13.5] [14.9] [20. ] [16.4] [17.7] [19.5] [20.2] [21.4] [19.9] [19. ] [19.1] [19.1] [20.1] [19.9] [19.6] [23.2] [29.8] [13.8] [13.3] [16.7] [12. ] [14.6] [21.4] [23. ] [23.7] [25. ] [21.8] [20.6] [21.2] [19.1] [20.6] [15.2] [ 7. ] [ 8.1] [13.6] [20.1] [21.8] [24.5] [23.1] [19.7] [18.3] [21.2] [17.5] [16.8] [22.4] [20.6] [23.9] [22. ] [11.9]]
Q6
Data set: Boston
In the Boston dataset, the feature PTRATIO refers to pupil teacher ratio.
Using a matplotlib scatter plot, plot MEDV median value of homes as y-axis and PTRATIO as x-axis
Return back PTRATIO as a NumPy array
# Scatter plot: median home value (y) against pupil-teacher ratio (x)
plt.scatter(boston_X['PTRATIO'], boston_y['MEDV'])
plt.xlabel('Pupil-teacher ratio')
plt.ylabel('Median home value ($1000s)')
plt.show()

# Extract PTRATIO as a NumPy array and print it
boston_X_arr = boston_X['PTRATIO'].to_numpy()
print(boston_X_arr)
[15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 15.2 15.2 15.2 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 21. 19.2 19.2 19.2 19.2 18.3 18.3 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 16.8 16.8 16.8 16.8 21.1 17.9 17.3 15.1 19.7 19.7 19.7 19.7 19.7 19.7 18.6 16.1 16.1 18.9 18.9 18.9 19.2 19.2 19.2 19.2 18.7 18.7 18.7 18.7 18.7 18.7 19. 19. 19. 19. 18.5 18.5 18.5 18.5 17.8 17.8 17.8 17.8 18.2 18.2 18.2 18. 18. 18. 18. 18. 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 17.8 17.8 17.8 17.8 17.8 17.8 17.8 17.8 17.8 19.1 19.1 19.1 19.1 19.1 19.1 19.1 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 16.6 16.6 16.6 16.6 16.6 16.6 16.6 17.8 17.8 17.8 17.8 17.8 17.8 17.8 17.8 15.2 15.2 15.2 15.2 15.2 15.2 15.6 15.6 14.4 12.6 12.6 12.6 17. 17. 14.7 14.7 14.7 14.7 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 16.4 16.4 16.4 16.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 16.6 16.6 16.6 16.6 16.6 16.6 19.1 19.1 19.1 19.1 19.1 19.1 19.1 19.1 19.1 19.1 16.4 16.4 15.9 13. 13. 13. 13. 13. 13. 13. 13. 13. 13. 13. 13. 18.6 18.6 18.6 18.6 18.6 17.6 17.6 17.6 17.6 17.6 14.9 14.9 14.9 14.9 13.6 15.3 15.3 18.2 16.6 16.6 16.6 19.2 19.2 19.2 16. 16. 16. 16. 16. 14.8 14.8 14.8 16.1 16.1 16.1 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 19.6 19.6 19.6 19.6 19.6 19.6 19.6 19.6 16.9 16.9 16.9 16.9 16.9 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 15.5 15.9 17.6 17.6 18.8 18.8 17.9 17. 19.7 19.7 18.3 18.3 17. 22. 22. 
20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.1 20.1 20.1 20.1 20.1 19.2 19.2 19.2 19.2 19.2 19.2 19.2 19.2 21. 21. 21. 21. 21. ]
Q7
Data set: Boston
Use np.linspace() to generate prediction X values from min to max PTRATIO
Return back the regression prediction space and regression predicted values
Make sure to label the axes appropriately
# Fit an ordinary least-squares regression of MEDV on PTRATIO.
# sklearn expects 2-D inputs, hence the reshape to column vectors.
linreg = lm.LinearRegression()
X = boston_X_arr.reshape(-1, 1)
y = boston_y_arr.reshape(-1, 1)
linreg.fit(X, y)

# Prediction space: 50 evenly spaced points spanning the PTRATIO range
x_test = np.linspace(boston_X_arr.min(), boston_X_arr.max()).reshape(-1, 1)
y_pred = linreg.predict(x_test)

# Show the prediction space and the corresponding fitted values
print("Prediction space (x):", x_test)
print()
print("Predicted values (y):", y_pred)
print()

# Overlay the fitted regression line on the raw scatter
plt.scatter(boston_X_arr, boston_y_arr)
plt.plot(x_test, y_pred, color='black', linewidth=3)
plt.xlabel('Pupil-teacher ratio')
plt.ylabel('Median home value ($1000s)')
plt.show()
Prediction space (x): [[12.6 ] [12.79183673] [12.98367347] [13.1755102 ] [13.36734694] [13.55918367] [13.75102041] [13.94285714] [14.13469388] [14.32653061] [14.51836735] [14.71020408] [14.90204082] [15.09387755] [15.28571429] [15.47755102] [15.66938776] [15.86122449] [16.05306122] [16.24489796] [16.43673469] [16.62857143] [16.82040816] [17.0122449 ] [17.20408163] [17.39591837] [17.5877551 ] [17.77959184] [17.97142857] [18.16326531] [18.35510204] [18.54693878] [18.73877551] [18.93061224] [19.12244898] [19.31428571] [19.50612245] [19.69795918] [19.88979592] [20.08163265] [20.27346939] [20.46530612] [20.65714286] [20.84897959] [21.04081633] [21.23265306] [21.4244898 ] [21.61632653] [21.80816327] [22. ]] Predicted values (y): [[35.16421874] [34.75039328] [34.33656781] [33.92274235] [33.50891688] [33.09509142] [32.68126595] [32.26744049] [31.85361502] [31.43978956] [31.02596409] [30.61213863] [30.19831316] [29.7844877 ] [29.37066224] [28.95683677] [28.54301131] [28.12918584] [27.71536038] [27.30153491] [26.88770945] [26.47388398] [26.06005852] [25.64623305] [25.23240759] [24.81858212] [24.40475666] [23.99093119] [23.57710573] [23.16328026] [22.7494548 ] [22.33562933] [21.92180387] [21.5079784 ] [21.09415294] [20.68032747] [20.26650201] [19.85267654] [19.43885108] [19.02502561] [18.61120015] [18.19737468] [17.78354922] [17.36972375] [16.95589829] [16.54207282] [16.12824736] [15.71442189] [15.30059643] [14.88677096]]